new utf8proc_map_custom for hooking in user-defined custom mappings (#89)

author Steven G. Johnson <stevenj@mit.edu>

Wed, 30 Nov 2016 15:40:26 +0000 (10:40 -0500)

committer GitHub <noreply@github.com>

Wed, 30 Nov 2016 15:40:26 +0000 (10:40 -0500)
author Steven G. Johnson <stevenj@mit.edu>
Wed, 30 Nov 2016 15:40:26 +0000 (10:40 -0500)
committer GitHub <noreply@github.com>
Wed, 30 Nov 2016 15:40:26 +0000 (10:40 -0500)
diff --git a/CMakeLists.txt b/CMakeLists.txt

index 5e9b8a1888c136c2d40190fa6f11231bdd1fd42e..be676ba26800c0e8ff745cc1a0622b9c210595c3 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,8 +10,8 @@ project (utf8proc C)
  # API version number (defined in utf8proc.h).
  # Be sure to also update these in Makefile and MANIFEST!
  set(SO_MAJOR 2)
-set(SO_MINOR 0)
-set(SO_PATCH 2)
+set(SO_MINOR 1)
+set(SO_PATCH 0)
  
  add_definitions (
    -DUTF8PROC_EXPORTS
diff --git a/MANIFEST b/MANIFEST

index 106a4f0b9ab875abe10277ce52caed566fd76b39..b39f8a81b3364125f059ce26191ce22ed8984e6a 100644 (file)
--- a/MANIFEST
+++ b/MANIFEST
@@ -2,6 +2,6 @@ include/
  include/utf8proc.h
  lib/
  lib/libutf8proc.a
-lib/libutf8proc.so -> libutf8proc.so.2.0.2
-lib/libutf8proc.so.2 -> libutf8proc.so.2.0.2
-lib/libutf8proc.so.2.0.2
+lib/libutf8proc.so -> libutf8proc.so.2.1.0
+lib/libutf8proc.so.2 -> libutf8proc.so.2.1.0
+lib/libutf8proc.so.2.1.0
diff --git a/Makefile b/Makefile

index 2bde4b1e2622bf68bbee88f777cf43f5c030d798..51995c3889b87651e68929bd54362c71a25764df 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -21,8 +21,8 @@ UCFLAGS = $(CFLAGS) $(PICFLAG) $(C99FLAG) $(WCFLAGS) -DUTF8PROC_EXPORTS
  # The API version number is defined in utf8proc.h.
  # Be sure to also update these ABI versions in MANIFEST and CMakeLists.txt!
  MAJOR=2
-MINOR=0
-PATCH=2
+MINOR=1
+PATCH=0
  
  OS := $(shell uname)
  ifeq ($(OS),Darwin) # MacOS X
@@ -49,7 +49,7 @@ clean:
  ifneq ($(OS),Darwin)
         rm -f libutf8proc.so.$(MAJOR)
  endif
-       rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case
+       rm -f test/tests.o test/normtest test/graphemetest test/printproperty test/charwidth test/valid test/iterate test/case test/custom
         rm -rf MANIFEST.new tmp
         $(MAKE) -C bench clean
         $(MAKE) -C data clean
@@ -136,7 +136,10 @@ test/iterate: test/iterate.c test/tests.o utf8proc.o utf8proc.h test/tests.h
  test/case: test/case.c test/tests.o utf8proc.o utf8proc.h test/tests.h
         $(CC) $(UCFLAGS) test/case.c test/tests.o utf8proc.o -o $@
  
-check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/charwidth test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
+test/custom: test/custom.c test/tests.o utf8proc.o utf8proc.h test/tests.h
+       $(CC) $(UCFLAGS) test/custom.c test/tests.o utf8proc.o -o $@
+
+check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/custom test/charwidth test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
         $(MAKE) -C bench
         test/normtest data/NormalizationTest.txt
         test/graphemetest data/GraphemeBreakTest.txt
@@ -144,3 +147,4 @@ check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeB
         test/valid
         test/iterate
         test/case
+       test/custom
diff --git a/NEWS.md b/NEWS.md

index a4e5321af0f185355c0b99ea451857f076216073..663e5067988a04729b8cf9208064e04c97b3f9ed 100644 (file)
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,17 @@
  # utf8proc release history #
  
+## Version 2.1 (not yet released) ##
+
+- New functions `utf8proc_map_custom` and `utf8proc_decompose_custom`
+  to allow user-supplied transformations of codepoints, in conjunction
+  with other transformations ([#89]).
+
+- New function `utf8proc_normalize_utf32` to apply normalizations
+  directly to UTF-32 data (not just UTF-8) ([#88]).
+
+- Fixed stack overflow that could occur due to incorrect definition
+  of `UINT16_MAX` with some compilers ([#84]).
+
  ## Version 2.0.2 ##
  
  2016-07-27:
@@ -279,3 +291,6 @@ Release of version 1.0.1
  [#78]: https://github.com/JuliaLang/utf8proc/issues/78
  [#79]: https://github.com/JuliaLang/utf8proc/issues/79
  [#80]: https://github.com/JuliaLang/utf8proc/issues/80
+[#84]: https://github.com/JuliaLang/utf8proc/pull/84
+[#88]: https://github.com/JuliaLang/utf8proc/pull/88
+[#89]: https://github.com/JuliaLang/utf8proc/pull/89
diff --git a/test/custom.c b/test/custom.c

new file mode 100644 (file)

index 0000000..f85b3cc
--- /dev/null
+++ b/test/custom.c
@@ -0,0 +1,27 @@
+#include "tests.h"
+
+static int thunk_test = 1;
+
+static utf8proc_int32_t custom(utf8proc_int32_t codepoint, void *thunk)
+{   /* test mapping: 'a' -> 'b', 'S' -> U+00DF, any other codepoint is returned unchanged */
+    check(((int *) thunk) == &thunk_test, "unexpected thunk passed");
+    switch (codepoint) {
+    case 'a': return 'b';
+    case 'S': return 0x00df; /* ß */
+    default:  return codepoint;
+    }
+}
+
+int main(void)
+{   /* runs "AaSb\uff41" through custom() plus casefold/compose/compat and verifies the mapped string */
+    utf8proc_uint8_t input[] = {0x41,0x61,0x53,0x62,0xef,0xbd,0x81,0x00}; /* "AaSb\uff41" */
+    utf8proc_uint8_t correct[] = {0x61,0x62,0x73,0x73,0x62,0x61,0x00}; /* "abssba" */
+    utf8proc_uint8_t *output;
+    check(utf8proc_map_custom(input, 0, &output, UTF8PROC_CASEFOLD | UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_NULLTERM,
+                              custom, &thunk_test) >= 0, "utf8proc_map_custom failed"); /* on error output is NULL; stop before using it */
+    printf("mapped \"%s\" -> \"%s\"\n", (char*)input, (char*)output);
+    check(strlen((char*) output) == 6, "incorrect output length");
+    check(!memcmp(correct, output, 7), "incorrect output data");
+    free(output);
+    return 0;
+}
diff --git a/utf8proc.c b/utf8proc.c

index 74886330ab68644d827bdd7cccd6790fe5991bf6..c14bbe13fc1eeed5a9ac3f590eff4f2673fb0edc 100644 (file)
--- a/utf8proc.c
+++ b/utf8proc.c
@@ -391,8 +391,6 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
    return s[utf8proc_category(c)];
  }
  
-
-
  #define utf8proc_decompose_lump(replacement_uc) \
    return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
    options & ~UTF8PROC_LUMP, last_boundclass)
@@ -485,6 +483,14 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc,
  UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
    const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
    utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
+) {
+    return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options, NULL, NULL); /* NULL custom_func: no user-defined mapping is applied */
+}
+
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
+  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
+  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
+  utf8proc_custom_func custom_func, void *custom_data
  ) {
    /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
    utf8proc_ssize_t wpos = 0;
@@ -511,6 +517,9 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
          rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
          if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
        }
+      if (custom_func != NULL) {
+        uc = custom_func(uc, custom_data);   /* user-specified custom mapping */
+      }
        decomp_result = utf8proc_decompose_char(
          uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
          &boundclass
@@ -683,15 +692,22 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
  
  UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
    const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
+) {
+    return utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL); /* NULL custom_func: plain mapping without a user-defined codepoint transformation */
+}
+
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
+  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
+  utf8proc_custom_func custom_func, void *custom_data
  ) {
    utf8proc_int32_t *buffer;
    utf8proc_ssize_t result;
    *dstptr = NULL;
-  result = utf8proc_decompose(str, strlen, NULL, 0, options);
+  result = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data);
    if (result < 0) return result;
    buffer = (utf8proc_int32_t *) malloc(result * sizeof(utf8proc_int32_t) + 1);
    if (!buffer) return UTF8PROC_ERROR_NOMEM;
-  result = utf8proc_decompose(str, strlen, buffer, result, options);
+  result = utf8proc_decompose_custom(str, strlen, buffer, result, options, custom_func, custom_data);
    if (result < 0) {
      free(buffer);
      return result;
diff --git a/utf8proc.h b/utf8proc.h

index 9d1f782d351d18e98d513a0238997927a716b225..96328fb45056cf411c7b1d27d761884dfcb3d67e 100644 (file)
--- a/utf8proc.h
+++ b/utf8proc.h
@@ -71,9 +71,9 @@
  /** The MAJOR version number (increased when backwards API compatibility is broken). */
  #define UTF8PROC_VERSION_MAJOR 2
  /** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
-#define UTF8PROC_VERSION_MINOR 0
+#define UTF8PROC_VERSION_MINOR 1
  /** The PATCH version (increased for fixes that do not change the API). */
-#define UTF8PROC_VERSION_PATCH 2
+#define UTF8PROC_VERSION_PATCH 0
  /** @} */
  
  #include <stdlib.h>
@@ -373,6 +373,13 @@ typedef enum {
    UTF8PROC_BOUNDCLASS_E_BASE_GAZ         = 18, /**< E_BASE + GLUE_AFTER_ZJW */
  } utf8proc_boundclass_t;
  
+/**
+ * Function pointer type passed to @ref utf8proc_map_custom and
+ * @ref utf8proc_decompose_custom, which is used to specify a user-defined
+ * mapping of codepoints to be applied in conjunction with other mappings.
+ */
+typedef utf8proc_int32_t (*utf8proc_custom_func)(utf8proc_int32_t codepoint, void *data);
+
  /**
   * Array containing the byte lengths of a UTF-8 encoded codepoint based
   * on the first byte.
@@ -480,6 +487,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(
   * `buffer` (which must contain at least `bufsize` entries).  In case of
   * success, the number of codepoints written is returned; in case of an
   * error, a negative error code is returned (@ref utf8proc_errmsg).
+ * See @ref utf8proc_decompose_custom to supply additional transformations.
   *
   * If the number of written codepoints would be bigger than `bufsize`, the
   * required buffer size is returned, while the buffer will be overwritten with
@@ -490,6 +498,18 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
    utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
  );
  
+/**
+ * The same as @ref utf8proc_decompose, but also takes a `custom_func` mapping function
+ * that is called on each codepoint in `str` before any other transformations
+ * (along with a `custom_data` pointer that is passed through to `custom_func`).
+ * The `custom_func` argument is ignored if it is `NULL`.  See also @ref utf8proc_map_custom.
+ */
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
+  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
+  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
+  utf8proc_custom_func custom_func, void *custom_data
+);
+
  /**
   * Normalizes the sequence of `length` codepoints pointed to by `buffer`
   * in-place (i.e., the result is also stored in `buffer`).
@@ -623,7 +643,8 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoi
   * in any case the result will be NULL terminated (though it might
   * contain NULL characters with the string if `str` contained NULL
   * characters). Other flags in the `options` field are passed to the
- * functions defined above, and regarded as described.
+ * functions defined above, and regarded as described.  See also
+ * @ref utf8proc_map_custom to supply a custom codepoint transformation.
   *
   * In case of success the length of the new string is returned,
   * otherwise a negative error code is returned.
@@ -635,6 +656,17 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
    const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
  );
  
+/**
+ * Like @ref utf8proc_map, but also takes a `custom_func` mapping function
+ * that is called on each codepoint in `str` before any other transformations
+ * (along with a `custom_data` pointer that is passed through to `custom_func`).
+ * The `custom_func` argument is ignored if it is `NULL`.
+ */
+UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
+  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
+  utf8proc_custom_func custom_func, void *custom_data
+);
+
  /** @name Unicode normalization
   *
   * Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
author	Steven G. Johnson <stevenj@mit.edu>
	Wed, 30 Nov 2016 15:40:26 +0000 (10:40 -0500)
committer	GitHub <noreply@github.com>
	Wed, 30 Nov 2016 15:40:26 +0000 (10:40 -0500)
CMakeLists.txt		patch \| blob \| history
MANIFEST		patch \| blob \| history
Makefile		patch \| blob \| history
NEWS.md		patch \| blob \| history
test/custom.c	[new file with mode: 0644]	patch \| blob
utf8proc.c		patch \| blob \| history
utf8proc.h		patch \| blob \| history